This document consists of all data used in our pressure cooker project. First we use the code provided by Steffen Greup to set up and load the raw data files and perform some minor data transformations. After that we add the code of Steffen and Abe, showing a basic first model. Then we do some of our own data exploration. Finally, we construct our own models and compare how well they generalize to new data.
# Clear the workspace and close any open graphics devices.
# NOTE(review): rm(list = ls()) does not give a truly fresh session (loaded
# packages and options persist); restarting R is the safer habit in scripts.
rm(list = ls()); graphics.off()
#' - in each comment block the links refer to websites where you can download the files needed: #nolint
#' - all variable names are in Dutch; you can rename these yourself. In your final presentation #nolint
#' make sure the client knows which variables you are talking about in your report #nolint
###########################################################################
# Populatie en adres (location details)
# https://duo.nl/open_onderwijsdata/databestanden/po/adressen/adressen-po-3.jsp
# https://duo.nl/open_onderwijsdata/images/03-alle-vestigingen-bo.csv
###########################################################################
# Read the school-location file. read_csv2() expects ';' as delimiter and
# ',' as decimal mark; clean_names() snake_cases the column headers.
d <- readr::read_csv2("data/input/03-alle-vestigingen-bo.csv") %>%
janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 6215 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (29): PROVINCIE, BRIN NUMMER, VESTIGINGSNUMMER, VESTIGINGSNAAM, STRAATNA...
## dbl (1): BEVOEGD GEZAG NUMMER
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the identifying and geographic columns.
d <- d %>%
dplyr::select(vestigingsnummer, postcode,
plaatsnaam, gemeentenaam,
gemeentenummer, provincie)
# The first four characters of vestigingsnummer form the BRIN (school) code;
# the remaining characters identify the branch (vestiging).
d <- d %>%
dplyr::mutate(brin_nummer = stringr::str_sub(vestigingsnummer, 1, 4),
vestigingsnummer = stringr::str_sub(vestigingsnummer, 5, -1))
###########################################################################
# Aantal leerlingen (number of students)
# https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/po-totaal/leerlingen-po-6.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/06-historische-data-leerlingaantallen-po.xlsx #nolint
###########################################################################
# Historical pupil counts per school branch; guess_max is raised so column
# types are guessed from the first 10,000 rows instead of the default 1,000.
d_n_lln <- readr::read_csv2("data/input/06-historische-data-leerlingaantallen-po.csv", #nolint
guess_max = 10000) %>%
janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 9539 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (32): BRIN_NUMMER, VESTIGINGSNUMMER, INSTELLINGSNAAM_VESTIGING, VEST_SOO...
## dbl (3): BEVOEGD_GEZAG_NUMMER, PEILDATUM, LAATSTE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep regular primary education only (type_po == "BO") plus the yearly
# leerlingen_* count columns.
d_n_lln <- d_n_lln %>%
dplyr::select(brin_nummer, vestigingsnummer,
type_po, dplyr::starts_with("leerlingen_")) %>%
dplyr::filter(type_po == "BO") %>%
dplyr::select(-type_po)
# Reshape from one column per year to one row per school-year, and derive the
# "schooljaar" label (e.g. 2015 -> "2015-2016").
d_n_lln <- d_n_lln %>%
tidyr::pivot_longer(cols = dplyr::starts_with("leerlingen_"),
names_to = "jaar",
values_to = "aantal_leerlingen",
names_prefix = "leerlingen_") %>%
dplyr::mutate(jaar = as.integer(jaar),
schooljaar = paste(jaar, jaar + 1, sep = "-"))
###########################################################################
# Toetsresultaten, gemeten in behaalde referentieniveaus (test results)
# https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/bo-sbo/refniveau.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2018-2019.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2017-2018.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2016-2017.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2015-2016.csv #nolint
###########################################################################
# Read and stack all yearly referentieniveau files into one data frame.
# VESTIGINGSNUMMER is forced to character so leading zeros survive the read.
# BUG FIX: list.files(pattern =) takes a regular expression; the original
# pattern ".csv" matched any character followed by "csv" anywhere in the
# name (e.g. "acsv", "x.csv.bak"). "\\.csv$" matches only names ending in
# the literal ".csv" extension.
d_toets <- list.files("data/input/referentieniveaus/",
full.names = TRUE, pattern = "\\.csv$") %>%
lapply(., FUN = readr::read_csv2,
col_types = cols(VESTIGINGSNUMMER = col_character())) %>%
dplyr::bind_rows() %>%
janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
# Branch numbers were read as bare digits; left-pad to the canonical
# two-character form ("1" -> "01") so later joins on vestigingsnummer line up.
d_toets <- d_toets %>%
dplyr::mutate(vestigingsnummer = stringr::str_pad(vestigingsnummer,
width = 2, side = "left", pad = "0")) #nolint
# Keep the per-level counts for arithmetic (rekenen), reading (lv) and
# language (tv), and derive the school year from the reference date.
# NOTE(review): rekenen_1s (level 1S) is folded into rekenen_2f, i.e. pupils
# at 1S count as having reached 2F for arithmetic — presumably intended
# since 1S sits above 2F, but confirm with the client.
d_toets <- d_toets %>%
dplyr::select(peildatum, brin_nummer,
vestigingsnummer,
dplyr::ends_with("1f"),
dplyr::ends_with("2f"), rekenen_1s) %>%
dplyr::mutate(rekenen_2f = rekenen_1s + rekenen_2f) %>%
dplyr::select(-rekenen_1s) %>%
dplyr::mutate(peiljaar = as.integer(stringr::str_sub(as.character(peildatum), 1, 4)), #nolint
schooljaar = paste(peiljaar - 1, peiljaar, sep = "-")) %>%
dplyr::select(-peildatum, -peiljaar)
# Total number of tested pupils: the sum over all nine subject-by-level counts.
d_toets <- d_toets %>%
dplyr::mutate(n_observaties = rekenen_lager1f + rekenen_1f
+ rekenen_2f + lv_lager1f
+ lv_1f + lv_2f + tv_lager1f
+ tv_1f + tv_2f)
# Outcome variable: percentage of observations at level 2F (or above,
# for arithmetic — see note above the fold of rekenen_1s).
d_toets <- d_toets %>%
dplyr::mutate(perc_2F = 100 * (rekenen_2f + lv_2f + tv_2f) / n_observaties)
d_toets <- d_toets %>%
dplyr::select(brin_nummer, vestigingsnummer,
schooljaar, n_observaties, perc_2F)
###########################################################################
# Add schoolweging (school weight; exact meaning unclear — ask the client)
#
# https://www.cbs.nl/nl-nl/maatwerk/2018/46/gemiddelde-verwachte-schoolscores-2015-2017 #nolint
# https://www.cbs.nl/-/media/_excel/2018/46/181109-gemiddelde-verwachte-schoolscores-def.xlsx #nolint
#
# https://www.cbs.nl/nl-nl/maatwerk/2021/12/schoolweging-2018-2020
# https://www.cbs.nl/-/media/_excel/2021/12/schoolweging-2018-2020.xlsx
###########################################################################
# The 2015-2017 workbook has one sheet per school year; its score column is
# renamed so all years share the "schoolweging" name used in the newer file.
d_weging_2015 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
sheet = "Tabel 1", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2015-2016") %>%
dplyr::rename(schoolweging = gemiddelde_score)
d_weging_2016 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
sheet = "Tabel 2", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2016-2017") %>%
dplyr::rename(schoolweging = gemiddelde_score)
# The warnings below come from '.' placeholders in numeric cells; readxl
# turns those cells into NA, which is the desired behaviour here.
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C6313 / R6313C3: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in D6313 / R6313C4: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in E6313 / R6313C5: got '.'
d_weging_2017 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
sheet = "Tabel 3", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2017-2018") %>%
dplyr::rename(schoolweging = gemiddelde_score)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C6327 / R6327C3: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in D6327 / R6327C4: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in E6327 / R6327C5: got '.'
# The 2018-2020 workbook already uses the schoolweging column name.
d_weging_2018 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
sheet = "Tabel 1", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2018-2019")
d_weging_2019 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
sheet = "Tabel 2", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2019-2020")
d_weging_2020 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
sheet = "Tabel 3", skip = 3) %>%
janitor::clean_names() %>%
dplyr::mutate(schooljaar = "2020-2021")
# Stack all years; drop_na(brin) removes sheet footer rows that have no
# school identifier.
d_weging <- dplyr::bind_rows(d_weging_2015, d_weging_2016, d_weging_2017,
d_weging_2018, d_weging_2019, d_weging_2020) %>%
dplyr::select(brin, vestiging, schooljaar,
schoolweging, schoolweging_spreiding = spreiding) %>%
tidyr::drop_na(brin)
# The per-year intermediates are no longer needed.
rm(d_weging_2015, d_weging_2016, d_weging_2017,
d_weging_2018, d_weging_2019, d_weging_2020)
# Align key column names with the other data sets for joining.
d_weging <- d_weging %>%
dplyr::rename(brin_nummer = brin,
vestigingsnummer = vestiging)
###########################################################################
# Add open data from CBS, based on postalcodes
###########################################################################
library(cbsodataR)
# all_tables <- cbs_get_toc() #nolint
#
# kerncijfers <- all_tables %>%
# dplyr::filter(stringr::str_detect(ShortTitle, pattern = 'Kerncijfers')) #nolint
# Download once and cache as RDS; subsequent runs read the cached file so the
# (slow) CBS API is not hit every time.
if (isFALSE(file.exists("data/input/cbs_open_data.rds"))) {
# Fetch one year's municipal key figures (inhabitants, inhabitants aged
# 0-15, population density) from the given CBS table and tag it with the
# calendar year. str_squish removes the padding CBS puts in region codes.
vraag_cbs_data_op <- function(tabelnummer, jaar) {
x <- cbs_get_data(tabelnummer, select = c("SoortRegio_2",
"Codering_3",
"AantalInwoners_5",
"k_0Tot15Jaar_8",
"Bevolkingsdichtheid_33"))
x <- x %>%
dplyr::mutate(dplyr::across(where(is.character), stringr::str_squish)) %>%
dplyr::filter(SoortRegio_2 == "Gemeente") %>%
dplyr::select(-SoortRegio_2) %>%
dplyr::mutate(jaar = jaar)
return(x)
}
# One CBS table per calendar year, 2016 through 2021.
d_cbs <- mapply(FUN = vraag_cbs_data_op,
tabelnummer = c("83487NED", "83765NED", "84286NED",
"84583NED", "84799NED", "85039NED"),
jaar = 2016:2021,
SIMPLIFY = FALSE) %>%
dplyr::bind_rows()
saveRDS(d_cbs, "data/input/cbs_open_data.rds")
} else {
d_cbs <- readRDS("data/input/cbs_open_data.rds")
}
# Rename CBS columns to descriptive Dutch names used in the rest of the file.
d_cbs <- d_cbs %>%
dplyr::rename(gemeentenummer = Codering_3,
gemeente_aantal_inwoners = AantalInwoners_5,
gemeente_aantal_inwoners_0_15_jaar = k_0Tot15Jaar_8,
gemeente_bevolkingsdichtheid = Bevolkingsdichtheid_33)
# Strip the two-letter region-type prefix from the municipality code, derive
# the share of inhabitants aged 0-15, and attach the CBS figures for calendar
# year `jaar` to school year (jaar-1)-(jaar).
d_cbs <- d_cbs %>%
dplyr::mutate(gemeentenummer = stringr::str_sub(gemeentenummer, 3, -1),
gemeente_perc_inwoners_0_tot_15_jaar = 100 * gemeente_aantal_inwoners_0_15_jaar / gemeente_aantal_inwoners, #nolint
schooljaar = paste(jaar - 1, jaar, sep = "-")) %>%
dplyr::select(gemeentenummer, schooljaar, gemeente_aantal_inwoners,
gemeente_perc_inwoners_0_tot_15_jaar,
gemeente_bevolkingsdichtheid)
###########################################################################
# Personeel (staff)
###########################################################################
#############################
# More features
# https://duo.nl/open_onderwijsdata/databestanden/po/onderwijspersoneel/po-personeel2.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/02-onderwijspersoneel-po-in-fte-2011-2020.xlsx #nolint
#############################
# Staff in FTE per school (brin), per function group and year.
d_personeel <- readxl::read_xlsx("data/input/02-onderwijspersoneel-po-in-fte-2011-2020.xlsx", #nolint
sheet = "per owtype-bestuur-brin-functie") %>%
janitor::clean_names() %>%
dplyr::select(-onderwijstype, -bevoegd_gezag)
# Abbreviate the function-group labels (dir = management, lio = teachers in
# training, op = teaching staff, oop_obp = support staff).
d_personeel <- d_personeel %>%
dplyr::mutate(functiegroep = dplyr::case_when(
functiegroep == "Directie" ~ "dir",
functiegroep == "Leraren in opleiding (LIO)" ~ "lio",
functiegroep == "Onbekend" ~ "onbekend",
functiegroep == "Onderwijsgevend personeel" ~ "op",
functiegroep == "Onderwijsondersteunend personeel (OOP/OBP)" ~ "oop_obp",
))
# When there is no data, the averages contain 0 instead of a missing value;
# recode those zeros to NA so they do not bias later calculations.
d_personeel <- d_personeel %>%
dplyr::mutate(across(dplyr::starts_with("gemiddelde_"),
~ifelse(. == 0, yes = NA_real_, no = .)))
# Drop board-level ("bovenschools") rows: staff not tied to a single school.
d_personeel <- d_personeel %>%
dplyr::filter(stringr::str_to_lower(brin_nummer) != "bovenschools")
# The wide columns encode "<variable>_<year>"; split the year (last 4 chars)
# from the variable name (everything before the final underscore).
d_personeel <- d_personeel %>%
tidyr::pivot_longer(cols = -c(brin_nummer, functiegroep),
names_to = "variabele_en_jaar",
values_to = "waarde") %>%
dplyr::mutate(jaar = stringr::str_sub(variabele_en_jaar,
start = -4, end = -1),
jaar = as.integer(jaar),
variabele = stringr::str_sub(variabele_en_jaar,
start = 1, end = -6)) %>%
dplyr::select(-variabele_en_jaar)
# Back to wide: one row per brin/functiegroep/jaar, one column per variable.
d_personeel <- d_personeel %>%
tidyr::pivot_wider(names_from = variabele,
values_from = waarde)
# Teaching staff ("op"): derive % male and % temporary contracts from FTEs.
d_personeel_a <- d_personeel %>%
dplyr::filter(functiegroep == "op") %>%
dplyr::mutate(perc_man = 100 * ftes_mannen / ftes,
perc_tijdelijk_contract = 100 * ftes_personen_in_tijdelijke_dienst / ftes) %>% #nolint
dplyr::select(brin_nummer, jaar, ftes,
gemiddelde_aanstelling = gemiddelde_ftes,
gemiddelde_leeftijd, perc_man, perc_tijdelijk_contract)
# gemiddelde_ftes, # gemiddelde_leeftijd
# Support staff ("oop_obp"): keep only the FTE total.
d_personeel_b <- d_personeel %>%
dplyr::filter(functiegroep == "oop_obp") %>%
dplyr::select(brin_nummer, jaar, ftes_ondersteunend = ftes)
# Combine teaching and support staff; the teacher/support ratio is Inf when
# a school has zero support FTEs (handled later in the analysis).
d_personeel_tot <- d_personeel_a %>%
dplyr::left_join(d_personeel_b, by = c("brin_nummer", "jaar")) %>%
dplyr::mutate(ratio_leraar_ondersteuner = ftes / ftes_ondersteunend,
schooljaar = paste(jaar, jaar + 1, sep = "-")) %>%
dplyr::select(-ftes, -ftes_ondersteunend, -jaar)
rm(d_personeel_a, d_personeel_b)
#############################
# leerling-leraar ratio's
# (teacher-student ratio)
# https://duo.nl/open_onderwijsdata/databestanden/po/onderwijspersoneel/po-personeel3.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/03-leerling-leraarratio-po-per-instelling.xlsx #nolint
#############################
# Student-teacher ratios per school (one column per year in the source).
d_leraar_leerling <- readxl::read_xlsx("data/input/03-leerling-leraarratio-po-per-instelling.xlsx", #nolint
sheet = "7. ratios-regio-bestuur-brin") %>% #nolint
janitor::clean_names()
# Keep regular primary education ("BAO") only.
d_leraar_leerling <- d_leraar_leerling %>%
dplyr::filter(o == "BAO") %>%
dplyr::select(brin_nummer = brin, dplyr::starts_with("ratio_"))
# Reshape to one row per school-year and derive the schooljaar label.
d_leraar_leerling <- d_leraar_leerling %>%
tidyr::pivot_longer(cols = dplyr::starts_with("ratio_"),
names_to = "jaar",
values_to = "ratio_leerling_leraar",
names_prefix = "ratio_") %>%
dplyr::mutate(jaar = as.integer(jaar),
schooljaar = paste(jaar, jaar + 1, sep = "-")) %>%
dplyr::select(-jaar)
###################
# Combineren (combine data)
# The data set is restricted to the years for which test results are available #nolint
# (this join makes a subset of data based on availability in d_toets)
# Some sources also have older data; those years are therefore not included. #nolint
###################
# d_toets is the left table, so every row of d_tot is one school-branch-year
# with observed test results; all other sources are attached via left joins
# and contribute NA where they have no matching row.
d_tot <- d_toets %>%
dplyr::left_join(d, by = c("brin_nummer", "vestigingsnummer")) %>% #nolint
dplyr::left_join(d_cbs, by = c("gemeentenummer", "schooljaar")) %>% #nolint
dplyr::left_join(d_leraar_leerling, by = c("brin_nummer", "schooljaar")) %>% #nolint
dplyr::left_join(d_personeel_tot, by = c("brin_nummer", "schooljaar")) %>% #nolint
dplyr::left_join(d_n_lln, by = c("brin_nummer", "vestigingsnummer", "schooljaar")) %>% #nolint
dplyr::left_join(d_weging, by = c("brin_nummer", "vestigingsnummer", "schooljaar")) #nolint
Now that all the data is loaded and a combined data frame has been made, we can start to build a very simple model. Here, an example by Steffen is provided:
#############################
# Small example model to show the outcome variable and the correction
# for pupil population (schoolweging)
# (play model provided by the client)
#############################
# The binomial GLM needs a proportion in [0, 1] rather than a percentage.
d_tot <- d_tot %>%
dplyr::mutate(prop_2F = perc_2F / 100)
# Proportion reaching 2F modelled on standardized schoolweging, weighted by
# the number of tested pupils per school-year.
mod1 <- glm(prop_2F ~ scale(schoolweging),
weights = n_observaties,
family = binomial(),
data = d_tot)
# Predicted proportion for a low (25) vs high (35) schoolweging; scale()
# inside the formula stores its center/scale, so predict() reuses them.
predict(mod1, newdata = tibble(schoolweging = c(25, 35)), type = "response")
## 1 2
## 0.6682653 0.4974504
Next we use play code 1, also provided by Steffen. This code shows how to build a model that uses differences in perc_2F as the outcome variable, using various predictors.
# ------------------------------------------
# play code to get you started
# needed objects:
# - d_toets should be available in the workspace after running make_dataset.
# - d_personeel_tot should be available in the workspace after running make_dataset. #nolint
# [it might be that this is not the best way to prepare your dataset!]
# here I reshape to wide format and calculate three diff variables:
# One row per school branch, one column per school year of perc_2F; the
# diff columns are the year-on-year changes.
tmp_file <- d_toets %>%
select(-n_observaties) %>%
pivot_wider(names_from = schooljaar, values_from = perc_2F) %>%
mutate(diff1 = `2016-2017` - `2015-2016`,
diff2 = `2017-2018` - `2016-2017`,
diff3 = `2018-2019` - `2017-2018`)
# some quick inspections; look very normal so lets do some linear modelling
tmp_file %>%
select(diff1, diff2, diff3) %>%
ggpairs(aes(alpha = .1))
## Warning: Removed 387 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 582 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 720 rows containing missing values
## Warning: Removed 582 rows containing missing values (geom_point).
## Warning: Removed 513 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 662 rows containing missing values
## Warning: Removed 720 rows containing missing values (geom_point).
## Warning: Removed 662 rows containing missing values (geom_point).
## Warning: Removed 589 rows containing non-finite values (stat_density).
# first model
# is the growth related to the scores on 2017-2018?
# if yes and negative effect, this could indicate regr towards the mean.
mod <- lm(diff3 ~ scale(`2017-2018`), data = tmp_file) # regr towards the mean!
summary(mod)
##
## Call:
## lm(formula = diff3 ~ scale(`2017-2018`), data = tmp_file)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.753 -7.777 0.650 8.242 48.370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8808 0.1586 5.553 2.92e-08 ***
## scale(`2017-2018`) -8.6848 0.1597 -54.367 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.3 on 6011 degrees of freedom
## (589 observations deleted due to missingness)
## Multiple R-squared: 0.3296, Adjusted R-squared: 0.3295
## F-statistic: 2956 on 1 and 6011 DF, p-value: < 2.2e-16
# now add some more data:
# NOTE(review): this join is by brin_nummer only, so schools with multiple
# branches (vestigingen) get the same personnel values duplicated across
# branches — verify this is acceptable.
tmp_file <- d_personeel_tot %>%
filter(schooljaar == "2017-2018") %>%
right_join(tmp_file, by = c("brin_nummer"))
# now add some more data:
tmp_file <- d_weging %>%
filter(schooljaar == "2017-2018") %>%
right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer"))
# extend the model
# All predictors standardized so coefficient sizes are comparable.
mod <- lm(diff3 ~ 1
+ scale(`2017-2018`)
+ scale(schoolweging)
+ scale(perc_man)
+ scale(gemiddelde_aanstelling)
+ scale(perc_tijdelijk_contract)
, data = tmp_file) # regr towards the mean
summary(mod)
##
## Call:
## lm(formula = diff3 ~ 1 + scale(`2017-2018`) + scale(schoolweging) +
## scale(perc_man) + scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract),
## data = tmp_file)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.193 -7.283 0.181 7.484 44.514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.87639 0.15157 5.782 7.76e-09 ***
## scale(`2017-2018`) -10.62431 0.16951 -62.677 < 2e-16 ***
## scale(schoolweging) -4.34980 0.16846 -25.821 < 2e-16 ***
## scale(perc_man) 0.08159 0.15703 0.520 0.603403
## scale(gemiddelde_aanstelling) -0.59366 0.15957 -3.720 0.000201 ***
## scale(perc_tijdelijk_contract) -0.32434 0.15743 -2.060 0.039425 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.52 on 5777 degrees of freedom
## (819 observations deleted due to missingness)
## Multiple R-squared: 0.4057, Adjusted R-squared: 0.4052
## F-statistic: 788.8 on 5 and 5777 DF, p-value: < 2.2e-16
# model selection:
# Backward AIC stepwise selection; drops perc_man in the first step.
mod_step <- step(mod)
## Start: AIC=28270.15
## diff3 ~ 1 + scale(`2017-2018`) + scale(schoolweging) + scale(perc_man) +
## scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract)
##
## Df Sum of Sq RSS AIC
## - scale(perc_man) 1 36 766156 28268
## <none> 766120 28270
## - scale(perc_tijdelijk_contract) 1 563 766683 28272
## - scale(gemiddelde_aanstelling) 1 1836 767956 28282
## - scale(schoolweging) 1 88419 854539 28900
## - scale(`2017-2018`) 1 520964 1287084 31268
##
## Step: AIC=28268.42
## diff3 ~ scale(`2017-2018`) + scale(schoolweging) + scale(gemiddelde_aanstelling) +
## scale(perc_tijdelijk_contract)
##
## Df Sum of Sq RSS AIC
## <none> 766156 28268
## - scale(perc_tijdelijk_contract) 1 576 766732 28271
## - scale(gemiddelde_aanstelling) 1 1801 767957 28280
## - scale(schoolweging) 1 88456 854612 28898
## - scale(`2017-2018`) 1 521875 1288031 31271
summary(mod_step)
##
## Call:
## lm(formula = diff3 ~ scale(`2017-2018`) + scale(schoolweging) +
## scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract),
## data = tmp_file)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.302 -7.279 0.205 7.498 44.572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8768 0.1516 5.785 7.63e-09 ***
## scale(`2017-2018`) -10.6198 0.1693 -62.736 < 2e-16 ***
## scale(schoolweging) -4.3505 0.1684 -25.828 < 2e-16 ***
## scale(gemiddelde_aanstelling) -0.5801 0.1574 -3.685 0.000231 ***
## scale(perc_tijdelijk_contract) -0.3278 0.1573 -2.084 0.037185 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.52 on 5778 degrees of freedom
## (819 observations deleted due to missingness)
## Multiple R-squared: 0.4057, Adjusted R-squared: 0.4053
## F-statistic: 986 on 4 and 5778 DF, p-value: < 2.2e-16
Next, we show the example provided by Abe. This model corrects for the effect of schoolweging when predicting the difference in perc_2F scores.
# --------------------------------------------------------
# play model 2.0 [after client interview]
# predict change in 2016-2017 to 2017-2018
# corrected for schoolweging
# including the score at 2016-2017; to model the regr. towards the mean.
# NOTE(review): the column named schoolweging_2016_2017 is filled from rows
# filtered to schooljaar == "2017-2018"; the label and the filter do not
# obviously match — verify which school year each weight actually refers to.
tmp_file_2 <- d_weging %>%
rename("schoolweging_2016_2017" = "schoolweging") %>%
filter(schooljaar == "2017-2018") %>%
right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
rename("schoolweging_2017_2018" = "schoolweging")
# missing data issues..
# Keep only rows where both scores and both weights are observed, so
# residuals can be computed for every remaining row.
tmp_file_clean <- tmp_file_2 %>%
filter(!is.na(`2016-2017`),
!is.na(`2017-2018`),
!is.na(schoolweging_2016_2017),
!is.na(schoolweging_2017_2018))
# Residualize each year's score on that year's schoolweging; the residual is
# the part of the score not explained by pupil population.
mod <- lm(`2016-2017` ~ 1 + schoolweging_2016_2017, data = tmp_file_clean)
tmp_file_clean$res_2016_2017 <- residuals(mod)
mod <- lm(`2017-2018` ~ 1 + schoolweging_2017_2018, data = tmp_file_clean)
tmp_file_clean$res_2017_2018 <- residuals(mod)
# Population-corrected change between the two years.
tmp_file_clean$diff_corrected_3 <- tmp_file_clean$res_2017_2018 - tmp_file_clean$res_2016_2017 #nolint
# extend the model
mod <- lm(diff_corrected_3 ~ 1
+ scale(res_2016_2017)
# + scale(schoolweging) #nolint
+ scale(perc_man)
+ scale(gemiddelde_aanstelling)
+ scale(perc_tijdelijk_contract)
, data = tmp_file_clean) # regr towards the mean
summary(mod)
##
## Call:
## lm(formula = diff_corrected_3 ~ 1 + scale(res_2016_2017) + scale(perc_man) +
## scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract),
## data = tmp_file_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.383 -7.727 0.255 7.793 43.761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08447 0.15697 -0.538 0.590492
## scale(res_2016_2017) -9.83952 0.15724 -62.578 < 2e-16 ***
## scale(perc_man) 0.58373 0.15913 3.668 0.000246 ***
## scale(gemiddelde_aanstelling) -0.22331 0.15917 -1.403 0.160674
## scale(perc_tijdelijk_contract) -0.49114 0.15734 -3.122 0.001808 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.94 on 5780 degrees of freedom
## (215 observations deleted due to missingness)
## Multiple R-squared: 0.4043, Adjusted R-squared: 0.4039
## F-statistic: 980.6 on 4 and 5780 DF, p-value: < 2.2e-16
# Backward AIC selection; gemiddelde_aanstelling is dropped here.
summary(step(mod))
## Start: AIC=28696.16
## diff_corrected_3 ~ 1 + scale(res_2016_2017) + scale(perc_man) +
## scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract)
##
## Df Sum of Sq RSS AIC
## - scale(gemiddelde_aanstelling) 1 281 824125 28696
## <none> 823844 28696
## - scale(perc_tijdelijk_contract) 1 1389 825233 28704
## - scale(perc_man) 1 1918 825762 28708
## - scale(res_2016_2017) 1 558158 1382003 31687
##
## Step: AIC=28696.13
## diff_corrected_3 ~ scale(res_2016_2017) + scale(perc_man) + scale(perc_tijdelijk_contract)
##
## Df Sum of Sq RSS AIC
## <none> 824125 28696
## - scale(perc_tijdelijk_contract) 1 1454 825579 28704
## - scale(perc_man) 1 1737 825862 28706
## - scale(res_2016_2017) 1 557928 1382053 31685
##
## Call:
## lm(formula = diff_corrected_3 ~ scale(res_2016_2017) + scale(perc_man) +
## scale(perc_tijdelijk_contract), data = tmp_file_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.318 -7.685 0.294 7.846 43.692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08445 0.15698 -0.538 0.590642
## scale(res_2016_2017) -9.83248 0.15717 -62.560 < 2e-16 ***
## scale(perc_man) 0.54864 0.15716 3.491 0.000485 ***
## scale(perc_tijdelijk_contract) -0.50197 0.15716 -3.194 0.001411 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.94 on 5781 degrees of freedom
## (215 observations deleted due to missingness)
## Multiple R-squared: 0.4041, Adjusted R-squared: 0.4038
## F-statistic: 1307 on 3 and 5781 DF, p-value: < 2.2e-16
Then, for our first look at the data we did some data exploration. Here we looked at anomalies in the data, as well as distributions and other points of interest that we may use in making our models later on.
We inspect the total dataset d_tot
# First look at the combined data: one row per school-branch-school-year.
head(d_tot)
## # A tibble: 6 × 24
## brin_nummer vestigingsnummer schooljaar n_observaties perc_2F postcode
## <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 00AP 00 2015-2016 36 66.7 2716 PH
## 2 00AR 00 2015-2016 51 76.5 6107 AW
## 3 00AV 00 2015-2016 21 33.3 3201 CN
## 4 00AZ 00 2015-2016 57 66.7 2971 AR
## 5 00BA 00 2015-2016 72 65.3 <NA>
## 6 00BB 00 2015-2016 45 60 9944 AR
## # … with 18 more variables: plaatsnaam <chr>, gemeentenaam <chr>,
## # gemeentenummer <chr>, provincie <chr>, gemeente_aantal_inwoners <int>,
## # gemeente_perc_inwoners_0_tot_15_jaar <dbl>,
## # gemeente_bevolkingsdichtheid <int>, ratio_leerling_leraar <dbl>,
## # gemiddelde_aanstelling <dbl>, gemiddelde_leeftijd <dbl>, perc_man <dbl>,
## # perc_tijdelijk_contract <dbl>, ratio_leraar_ondersteuner <dbl>, jaar <int>,
## # aantal_leerlingen <chr>, schoolweging <dbl>, …
Next we want to make sure that all the variables are of the correct datatype and if not, we should change that
# Inspect the class of every column (str(d_tot) would be a more compact
# alternative); this reveals below that aantal_leerlingen came in as character.
lapply(d_tot, class)
## $brin_nummer
## [1] "character"
##
## $vestigingsnummer
## [1] "character"
##
## $schooljaar
## [1] "character"
##
## $n_observaties
## [1] "numeric"
##
## $perc_2F
## [1] "numeric"
##
## $postcode
## [1] "character"
##
## $plaatsnaam
## [1] "character"
##
## $gemeentenaam
## [1] "character"
##
## $gemeentenummer
## [1] "character"
##
## $provincie
## [1] "character"
##
## $gemeente_aantal_inwoners
## [1] "integer"
##
## $gemeente_perc_inwoners_0_tot_15_jaar
## [1] "numeric"
##
## $gemeente_bevolkingsdichtheid
## [1] "integer"
##
## $ratio_leerling_leraar
## [1] "numeric"
##
## $gemiddelde_aanstelling
## [1] "numeric"
##
## $gemiddelde_leeftijd
## [1] "numeric"
##
## $perc_man
## [1] "numeric"
##
## $perc_tijdelijk_contract
## [1] "numeric"
##
## $ratio_leraar_ondersteuner
## [1] "numeric"
##
## $jaar
## [1] "integer"
##
## $aantal_leerlingen
## [1] "character"
##
## $schoolweging
## [1] "numeric"
##
## $schoolweging_spreiding
## [1] "numeric"
##
## $prop_2F
## [1] "numeric"
We notice that “aantal leerlingen” is a character variable, while it should be numeric. We transform this:
# Convert "aantal_leerlingen" from character to numeric.
# sum(is.na(x)) is the idiomatic form of length(which(is.na(x))): a logical
# vector sums directly, with no intermediate index vector.
# Missing values before coercion:
sum(is.na(d_tot$aantal_leerlingen))
## [1] 1007
# Missing values after coercion (the warning flags unparseable strings):
sum(is.na(as.numeric(d_tot$aantal_leerlingen)))
## Warning: NAs introduced by coercion
## [1] 1008
# Exactly one extra NA appears, i.e. a single value could not be parsed as
# a number; accept that and convert the column.
d_tot <- d_tot %>%
mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
class(d_tot$aantal_leerlingen)
## [1] "numeric"
Up next, we tackle the issue of missing data. In this dataset there are 2 types of missing or wrong data. The first is NA data (missing values), the second is Inf data (which can occur when calculating a proportion and having to divide by 0).
# Number of observations
nrow(d_tot)
## [1] 25718
# remove NAs
# NOTE(review): drop_na() with no columns does listwise deletion — a row is
# removed if ANY column is NA (3124 rows here). Consider whether restricting
# this to the modelling variables would keep more data.
d_tot <- d_tot %>%
drop_na()
# Number of observations after removing NAs
nrow(d_tot)
## [1] 22594
For some further exploration of data we look at some of our variables and their distributions
# Distribution of branch numbers: the vast majority of rows are main
# locations ("00"); higher branch numbers are increasingly rare.
d_tot %>% count(vestigingsnummer)
## # A tibble: 5 × 2
## vestigingsnummer n
## <chr> <int>
## 1 00 22249
## 2 01 289
## 3 02 38
## 4 03 15
## 5 04 3
# Rows per school year: roughly balanced across the four years.
d_tot %>% count(schooljaar)
## # A tibble: 4 × 2
## schooljaar n
## <chr> <int>
## 1 2015-2016 5558
## 2 2016-2017 5579
## 3 2017-2018 5552
## 4 2018-2019 5905
# Outcome variable spans the full 0-100 range; median just under 60%.
d_tot %>% select(perc_2F) %>% summary()
## perc_2F
## Min. : 0.00
## 1st Qu.: 50.00
## Median : 59.65
## Mean : 58.53
## 3rd Qu.: 68.33
## Max. :100.00
# Schoolweging roughly symmetric around 30.
d_tot %>% select(schoolweging) %>% summary()
## schoolweging
## Min. :18.90
## 1st Qu.:27.80
## Median :29.80
## Mean :29.91
## 3rd Qu.:31.90
## Max. :41.51
Now that we have a better understanding of the data, we will be taking a look at some of the distributions. Maybe we can already see if there are any high correlations or remarkable distributions among the quantitative variables
# Pairwise scatter/density/correlation matrix over the main quantitative
# variables; the warnings below stem from NA/Inf values in
# ratio_leraar_ondersteuner (Inf when a school has zero support FTEs).
d_tot %>% select(perc_2F, n_observaties, perc_man, aantal_leerlingen,
ratio_leraar_ondersteuner, ratio_leerling_leraar,
gemeente_perc_inwoners_0_tot_15_jaar,
schoolweging_spreiding, gemiddelde_aanstelling) %>%
ggpairs(aes(alpha = .1))
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1644 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_text).
# Univariate density plots of the approximately normally distributed variables.
d_tot %>% ggplot(aes(prop_2F)) + geom_density()
d_tot %>% ggplot(aes(schoolweging)) + geom_density()
d_tot %>% ggplot(aes(schoolweging_spreiding)) + geom_density()
d_tot %>% ggplot(aes(ratio_leerling_leraar)) + geom_density()
d_tot %>% ggplot(aes(gemiddelde_aanstelling)) + geom_density()
d_tot %>% ggplot(aes(gemiddelde_leeftijd)) + geom_density()
d_tot %>% ggplot(aes(aantal_leerlingen)) + geom_density()
d_tot %>% ggplot(aes(gemeente_perc_inwoners_0_tot_15_jaar)) + geom_density()
# Skewed / non-normal variables; the last two are candidates for a log
# transform before modelling.
d_tot %>% ggplot(aes(perc_man)) + geom_density()
d_tot %>% ggplot(aes(perc_tijdelijk_contract)) + geom_density()
d_tot %>% ggplot(aes(ratio_leraar_ondersteuner)) + geom_density()
## Warning: Removed 1644 rows containing non-finite values (stat_density).
d_tot %>% ggplot(aes(gemeente_aantal_inwoners)) + geom_density() # transform
d_tot %>% ggplot(aes(gemeente_bevolkingsdichtheid)) + geom_density() #transform
The first thing we noticed when looking at Abe's code is that he used the name diff_corrected_3 for a quantity that actually corresponds with diff2 (the change from 2016-2017 to 2017-2018). We corrected this naming when constructing the data frame for the year 2017-2018. We then apply some more transformations to create our final training data.
# Abe's diff_corrected_3 measured the 2016-2017 -> 2017-2018 change, which
# lines up with diff2; recompute it under the corrected name.
tmp_file_clean$diff_corrected_2 <- tmp_file_clean$res_2017_2018 -
  tmp_file_clean$res_2016_2017
# Training frame: drop the misnamed column, then keep identifiers, yearly
# scores, year-on-year diffs, residuals and the corrected diff.
df_train <- tmp_file_clean %>%
  select(-diff_corrected_3)
# Train data frame
df_train <- df_train %>%
  select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
         `2018-2019`, diff1, diff2, diff3, res_2016_2017, res_2017_2018,
         diff_corrected_2)
# Attach the 2017-2018 rows of the combined data; the as.numeric() is a
# no-op if the earlier aantal_leerlingen conversion already ran.
train_tot <- d_tot %>%
  filter(schooljaar == "2017-2018") %>%
  left_join(df_train, by = c("brin_nummer", "vestigingsnummer")) %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))
# Listwise deletion of rows with any missing value.
train_tot <- train_tot[complete.cases(train_tot), ]
# Rename the backtick year columns to syntactic names and zero out infinite
# values (ratio_leraar_ondersteuner is Inf for schools with 0 support FTEs).
# BUG FIX: the original ran a second pass testing abs(.) == -Inf, which can
# never be TRUE because abs() is non-negative — the first pass on
# abs(.) == Inf already caught both signs. A single is.infinite() check says
# what is meant; across(where(is.numeric), ...) replaces superseded mutate_if.
train_tot <- train_tot %>%
  rename("y2015_2016" = "2015-2016",
         "y2016_2017" = "2016-2017",
         "y2017_2018" = "2017-2018",
         "y2018_2019" = "2018-2019") %>%
  mutate(across(where(is.numeric), ~ifelse(is.infinite(.), 0, .)))
To create the test data, we essentially create the same data frame as the one made above (by Abe), but this time looking at data from 2018-2019:
# now add some more data:
# NOTE(review): this first test_data assignment is immediately overwritten
# below, so the d_personeel_tot join has no effect. Possibly the later joins
# were meant to build on test_data instead of tmp_file -- confirm intent.
test_data <- d_personeel_tot %>%
filter(schooljaar == "2018-2019") %>%
right_join(tmp_file, by = c("brin_nummer"))
# now add some more data:
# NOTE(review): this assignment is also overwritten by the next one and has
# no effect.
test_data <- d_weging %>%
filter(schooljaar == "2018-2019") %>%
right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer"))
# Final test_data: bring in school weights for both years. The first rename
# relabels d_weging's schoolweging (filtered to schooljaar 2018-2019) as
# schoolweging_2017_2018, and the post-join rename relabels the schoolweging
# column coming from tmp_file as schoolweging_2018_2019.
# NOTE(review): given the 2018-2019 filter on d_weging, these two year labels
# look like they may be swapped -- verify which source holds which year.
test_data <- d_weging %>%
rename("schoolweging_2017_2018" = "schoolweging") %>%
filter(schooljaar == "2018-2019") %>%
right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
rename("schoolweging_2018_2019" = "schoolweging")
# missing data issues..
# Keep only rows with both years' scores and both school weights, so the
# lm() fits below have complete data.
clean_test_data <- test_data %>%
filter(!is.na(`2017-2018`),
!is.na(`2018-2019`),
!is.na(schoolweging_2017_2018),
!is.na(schoolweging_2018_2019))
# Residualize each year's score on its school weight, then take the
# difference of residuals as the corrected difference (diff_corrected_3).
mod <- lm(`2017-2018` ~ 1 + schoolweging_2017_2018, data = clean_test_data)
clean_test_data$res_2017_2018 <- residuals(mod)
mod <- lm(`2018-2019` ~ 1 + schoolweging_2018_2019, data = clean_test_data)
clean_test_data$res_2018_2019 <- residuals(mod)
clean_test_data$diff_corrected_3 <- clean_test_data$res_2018_2019 -
clean_test_data$res_2017_2018
# Test data frame: identifiers, yearly scores, differences and residuals.
df_test <- clean_test_data %>%
select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
`2018-2019`, diff1, diff2, diff3, res_2017_2018, res_2018_2019,
diff_corrected_3)
# Attach the school-level covariates (d_tot) for the 2018-2019 school year.
test_tot <- d_tot %>%
  filter(schooljaar == "2018-2019") %>%
  left_join(df_test, by = c("brin_nummer", "vestigingsnummer")) %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))
# Listwise deletion: drop rows with any missing value.
test_tot <- test_tot[complete.cases(test_tot), ]
# Rename year columns to syntactic names and zero out infinities.
test_tot <- test_tot %>%
  rename("y2015_2016" = "2015-2016",
         "y2016_2017" = "2016-2017",
         "y2017_2018" = "2017-2018",
         "y2018_2019" = "2018-2019") %>%
  # Single pass with is.infinite() covers both Inf and -Inf. The original
  # second pass tested abs(.) == -Inf, which can never be TRUE, so it was
  # dead code; behavior is unchanged.
  mutate_if(is.numeric, ~ifelse(is.infinite(.), 0, .))
Now that we have our training data and our test data, we check whether any columns should be deleted because of near-zero variance.
# Return the names of near-zero-variance predictors in `df`, using
# caret::nearZeroVar(names = TRUE).
#
# @param df A data frame of candidate predictors.
# @return A character vector of offending column names; character(0) when
#   none are found. (The original returned the integer 0L in the empty case,
#   making the return type unstable for downstream code.)
getNearZeroVariationPredictors <- function(df) {
  nzv_predictors <- df %>%
    nearZeroVar(names = TRUE)
  if (length(nzv_predictors) == 0) {
    character(0)
  } else {
    nzv_predictors
  }
}
# Names of near-zero-variance columns in the training data (printed below).
nzv_train <- getNearZeroVariationPredictors(train_tot)
nzv_train
## [1] "vestigingsnummer" "schooljaar" "jaar"
It appears there are some variables which we should delete. However, these are not variables we intend to use in our models. So we choose to ignore the outcome here. Next, we look at highly correlated columns.
# Find predictors with pairwise correlation above 0.9 (candidates to drop).
# Identifier and geography columns are excluded before computing cor().
high_cor <- train_tot %>%
select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer)) %>%
cor() %>%
findCorrelation(.9, names = TRUE)
high_cor
## [1] "perc_2F" "prop_2F" "diff2"
We now make a dataframe which we can use to create our models. First the training data (from 2017-2018).
# Modelling frame for training: drop identifiers, highly correlated columns,
# and the outcome-derived columns that would leak the target.
train_model <- train_tot %>%
select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
diff2, diff3,
# remove variables with high correlation
perc_2F, y2015_2016,
# these predictors make the r-squared 1
y2017_2018, y2018_2019, y2016_2017,
res_2016_2017, res_2017_2018))
colnames(train_model)
## [1] "n_observaties"
## [2] "gemeente_aantal_inwoners"
## [3] "gemeente_perc_inwoners_0_tot_15_jaar"
## [4] "gemeente_bevolkingsdichtheid"
## [5] "ratio_leerling_leraar"
## [6] "gemiddelde_aanstelling"
## [7] "gemiddelde_leeftijd"
## [8] "perc_man"
## [9] "perc_tijdelijk_contract"
## [10] "ratio_leraar_ondersteuner"
## [11] "aantal_leerlingen"
## [12] "schoolweging"
## [13] "schoolweging_spreiding"
## [14] "prop_2F"
## [15] "diff1"
## [16] "diff_corrected_2"
As well as the test data from 2018-2019. We rename diff2 to diff1 so it matches the column names of the training data, but we keep in mind that it actually represents diff2.
# Modelling frame for testing: mirror the training selection one year later.
# diff2 is renamed to diff1 purely so the column names line up with the
# training frame; semantically it is still the diff2 quantity.
test_model <- test_tot %>%
select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
diff1, diff3,
# remove variables with high correlation
perc_2F, y2015_2016,
# these predictors make the r-squared 1
y2017_2018, y2018_2019, y2016_2017,
res_2017_2018, res_2018_2019)) %>%
rename("diff1" = "diff2")
colnames(test_model)
## [1] "n_observaties"
## [2] "gemeente_aantal_inwoners"
## [3] "gemeente_perc_inwoners_0_tot_15_jaar"
## [4] "gemeente_bevolkingsdichtheid"
## [5] "ratio_leerling_leraar"
## [6] "gemiddelde_aanstelling"
## [7] "gemiddelde_leeftijd"
## [8] "perc_man"
## [9] "perc_tijdelijk_contract"
## [10] "ratio_leraar_ondersteuner"
## [11] "aantal_leerlingen"
## [12] "schoolweging"
## [13] "schoolweging_spreiding"
## [14] "prop_2F"
## [15] "diff1"
## [16] "diff_corrected_3"
For our linear model we will be using a forward stepwise approach. We first define a nullmodel and a full model.
# Null (intercept-only) and full linear models that bound the forward
# stepwise search below.
nullmod <- lm(diff_corrected_2 ~ 1, data = train_model)
fullmod <- lm(diff_corrected_2 ~., data = train_model)
summary(fullmod)
##
## Call:
## lm(formula = diff_corrected_2 ~ ., data = train_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.690 -6.551 -0.602 6.156 63.539
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.786e+01 3.299e+00 -23.601 < 2e-16 ***
## n_observaties -7.157e-03 6.212e-03 -1.152 0.24935
## gemeente_aantal_inwoners -8.877e-07 1.106e-06 -0.802 0.42236
## gemeente_perc_inwoners_0_tot_15_jaar -1.295e-01 7.751e-02 -1.671 0.09481 .
## gemeente_bevolkingsdichtheid 4.004e-04 1.367e-04 2.930 0.00341 **
## ratio_leerling_leraar -1.443e-01 4.692e-02 -3.075 0.00211 **
## gemiddelde_aanstelling 1.213e+00 1.849e+00 0.656 0.51183
## gemiddelde_leeftijd -5.438e-02 3.273e-02 -1.661 0.09672 .
## perc_man -1.753e-02 1.482e-02 -1.183 0.23669
## perc_tijdelijk_contract 2.214e-03 1.590e-02 0.139 0.88924
## ratio_leraar_ondersteuner -2.904e-03 6.627e-03 -0.438 0.66120
## aantal_leerlingen -8.720e-04 2.377e-03 -0.367 0.71376
## schoolweging 1.089e+00 4.452e-02 24.461 < 2e-16 ***
## schoolweging_spreiding 8.224e-01 1.856e-01 4.430 9.6e-06 ***
## prop_2F 7.918e+01 1.154e+00 68.622 < 2e-16 ***
## diff1 -3.955e-01 8.590e-03 -46.046 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.22 on 5368 degrees of freedom
## Multiple R-squared: 0.5492, Adjusted R-squared: 0.5479
## F-statistic: 436 on 15 and 5368 DF, p-value: < 2.2e-16
Then we use the step function to obtain the best model.
# Forward stepwise selection by AIC, starting from the null model and adding
# terms up to the scope of fullmod; trace suppressed for a quiet report.
finalmod <- MASS::stepAIC(nullmod,
scope = list(upper = fullmod),
direction = "forward", trace = FALSE)
finalmod
##
## Call:
## lm(formula = diff_corrected_2 ~ prop_2F + diff1 + schoolweging +
## schoolweging_spreiding + ratio_leerling_leraar + gemeente_bevolkingsdichtheid +
## n_observaties + gemiddelde_leeftijd + gemeente_perc_inwoners_0_tot_15_jaar,
## data = train_model)
##
## Coefficients:
## (Intercept) prop_2F
## -7.725e+01 7.914e+01
## diff1 schoolweging
## -3.957e-01 1.091e+00
## schoolweging_spreiding ratio_leerling_leraar
## 8.245e-01 -1.414e-01
## gemeente_bevolkingsdichtheid n_observaties
## 3.312e-04 -9.170e-03
## gemiddelde_leeftijd gemeente_perc_inwoners_0_tot_15_jaar
## -5.839e-02 -1.314e-01
summary(finalmod)
##
## Call:
## lm(formula = diff_corrected_2 ~ prop_2F + diff1 + schoolweging +
## schoolweging_spreiding + ratio_leerling_leraar + gemeente_bevolkingsdichtheid +
## n_observaties + gemiddelde_leeftijd + gemeente_perc_inwoners_0_tot_15_jaar,
## data = train_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.556 -6.614 -0.623 6.139 63.464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.725e+01 2.973e+00 -25.979 < 2e-16 ***
## prop_2F 7.914e+01 1.151e+00 68.729 < 2e-16 ***
## diff1 -3.957e-01 8.581e-03 -46.111 < 2e-16 ***
## schoolweging 1.091e+00 4.342e-02 25.132 < 2e-16 ***
## schoolweging_spreiding 8.245e-01 1.847e-01 4.465 8.19e-06 ***
## ratio_leerling_leraar -1.414e-01 4.600e-02 -3.074 0.002121 **
## gemeente_bevolkingsdichtheid 3.313e-04 9.593e-05 3.453 0.000558 ***
## n_observaties -9.170e-03 3.014e-03 -3.043 0.002356 **
## gemiddelde_leeftijd -5.839e-02 3.093e-02 -1.888 0.059113 .
## gemeente_perc_inwoners_0_tot_15_jaar -1.314e-01 7.571e-02 -1.735 0.082775 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.21 on 5374 degrees of freedom
## Multiple R-squared: 0.549, Adjusted R-squared: 0.5482
## F-statistic: 726.9 on 9 and 5374 DF, p-value: < 2.2e-16
We use this model to predict the test data and obtain the RMSE. We also plot the predicted data against the observed data.
# Predict the 2018-2019 test set with the stepwise model and compute RMSE.
pred_lm <- predict(finalmod, newdata = test_model)
RMSE_lm <- RMSE(pred = pred_lm, obs = test_model$diff_corrected_3)
RMSE_lm
## [1] 9.466832
# Predicted vs observed scatter; points on the identity line (geom_abline)
# are perfect predictions. RMSE is annotated in the plot.
tibble(pred_lm = pred_lm, diff_corrected_3 = test_model$diff_corrected_3) %>%
ggplot(aes(x = pred_lm, y = diff_corrected_3)) +
geom_point(alpha = .2) +
geom_abline() +
labs(title = "Predictions vs Observations", y = "Corrected Difference of
Observations (2018/2019 - 2017/2018)", x = "LM Predictions",
subtitle = "Using data to predict 1 year into the future",
caption = "Data Source: Inspectorate of Education") +
theme_minimal() +
geom_text(x = 30, y = -30, size = 4,
label = paste("RMSE = ", round(RMSE_lm, 3)))
# Linearity diagnostics for the stepwise model.
fit <- finalmod
plot(fit, which = 1) # should be red straight line at 0
res <- lmtest::resettest(fit) # if significant --> assumption violated
res
##
## RESET test
##
## data: fit
## RESET = 8.8826, df1 = 2, df2 = 5372, p-value = 0.0001408
round(res$statistic, 5) # RESET test statistic
## RESET
## 8.88257
round(res$p.value, 5) # significant here -> linearity assumption violated
## [1] 0.00014
# Homoscedasticity diagnostics.
plot(fit, which = 3) # scale-location plot -> should be red straight line at 1
ncv <- car::ncvTest(fit) # if significant --> assumption violated
ncv
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.2477182, Df = 1, p = 0.61869
round(ncv$ChiSquare, 5) # NCV score-test statistic
## [1] 0.24772
round(ncv$p, 5) # non-significant here -> no evidence against constant variance
## [1] 0.61869
# Normality (Q-Q) and multicollinearity checks.
plot(fit, which = 2)
#shp <- shapiro.test(resid(fit)) # if significant assumption is violated
round(car::vif(fit), 5) # > 10 suspicious
## prop_2F diff1
## 1.24697 1.00525
## schoolweging schoolweging_spreiding
## 1.38868 1.09715
## ratio_leerling_leraar gemeente_bevolkingsdichtheid
## 1.15799 1.17730
## n_observaties gemiddelde_leeftijd
## 1.21692 1.02232
## gemeente_perc_inwoners_0_tot_15_jaar
## 1.04909
# Influence diagnostics: leverage, standardized residuals, Cook's distance.
plot(fit, which = 5) # Residuals vs Leverage plot
# Rule of thumb: leverage above twice the mean hat value is "high".
abline(v = 2 * mean(hatvalues(fit)), col = 3, lty = 3)
sum(hatvalues(fit) > 2 * mean(hatvalues(fit)))
## [1] 344
outliers_iv_index <- which(hatvalues(fit) > 2 * mean(hatvalues(fit)))
# Rows with |standardized residual| > 2 are flagged as outliers on the DV.
sum(abs(rstandard(fit)) > 2)
## [1] 261
outliers_dv_index <- which(abs(rstandard(fit)) > 2)
# Cook's distance > 0.5 would indicate strongly influential points.
sum(cooks.distance(fit) > 0.5)
## [1] 0
Fit model with outliers removed and compare it with the original model.
# Refit the full model with the high-leverage and large-residual rows removed
# (negative indexing tolerates any index appearing in both sets).
outliers_removed <- train_model[-c(outliers_iv_index, outliers_dv_index),]
fit_outliers_removed <- lm(diff_corrected_2 ~., data = outliers_removed)
summary(fit_outliers_removed)
##
## Call:
## lm(formula = diff_corrected_2 ~ ., data = outliers_removed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.1071 -5.9516 -0.4077 5.9053 22.3616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.173e+01 3.129e+00 -26.116 < 2e-16 ***
## n_observaties -2.487e-03 6.057e-03 -0.411 0.6814
## gemeente_aantal_inwoners -8.558e-07 1.020e-06 -0.839 0.4015
## gemeente_perc_inwoners_0_tot_15_jaar -8.704e-02 7.379e-02 -1.180 0.2382
## gemeente_bevolkingsdichtheid 3.188e-04 1.297e-04 2.457 0.0140 *
## ratio_leerling_leraar -1.908e-01 4.766e-02 -4.003 6.34e-05 ***
## gemiddelde_aanstelling 8.688e-01 1.673e+00 0.519 0.6037
## gemiddelde_leeftijd -3.030e-02 2.976e-02 -1.018 0.3088
## perc_man -2.267e-02 1.342e-02 -1.690 0.0911 .
## perc_tijdelijk_contract -3.471e-03 1.457e-02 -0.238 0.8117
## ratio_leraar_ondersteuner -5.893e-04 5.725e-03 -0.103 0.9180
## aantal_leerlingen -6.438e-04 2.320e-03 -0.278 0.7814
## schoolweging 1.192e+00 4.190e-02 28.453 < 2e-16 ***
## schoolweging_spreiding 5.757e-01 1.812e-01 3.176 0.0015 **
## prop_2F 8.071e+01 1.064e+00 75.849 < 2e-16 ***
## diff1 -4.049e-01 8.179e-03 -49.498 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.626 on 4788 degrees of freedom
## Multiple R-squared: 0.6191, Adjusted R-squared: 0.6179
## F-statistic: 518.8 on 15 and 4788 DF, p-value: < 2.2e-16
We now look at a less flexible model, the lasso regression. Before we make a lasso model, we need to create an X matrix and a y outcome vector.
## Train data: outcome vector y and predictor matrix X for glmnet.
y <- train_model$diff_corrected_2
X <- train_model %>%
  select(- diff_corrected_2) %>%
  as.matrix()
## Test data: held-out outcome and predictors for evaluation.
y_test <- test_model$diff_corrected_3
X_test <- test_model %>%
  select(- diff_corrected_3) %>%
  as.matrix()
# 5-fold cross-validated lasso (alpha = 1), no intercept. FALSE is spelled
# out instead of the reassignable shorthand F (same value, safer idiom).
# trace.it = 1 is what prints the verbose progress bars below.
lasso_fit <- cv.glmnet(X, y,
                       family = "gaussian", trace.it = 1, nfolds = 5,
                       intercept = FALSE, alpha = 1)
## Training
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|======================================================================| 100%
## Fold: 1/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|======================================================================| 100%
## Fold: 2/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|======================================================================| 100%
## Fold: 3/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|======================================================================| 100%
## Fold: 4/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|======================================================================| 100%
## Fold: 5/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|======================================================================| 100%
print(lasso_fit)
##
## Call: cv.glmnet(x = X, y = y, nfolds = 5, trace.it = 1, family = "gaussian", intercept = F, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.0238 64 115.6 3.531 13
## 1se 0.5122 31 119.0 3.409 10
We use this model to predict the test set and calculate the RMSE. We also added a plot of the predicted and observed scores.
pred_lasso <- predict(lasso_fit, newx = X_test, s = "lambda.min")
RMSE_lasso <- RMSE(pred = pred_lasso, obs = y_test)
RMSE_lasso
## [1] 10.07046
plot(pred_lasso, y_test)
abline(0, 1)
Lastly we look at fitting a ridge model. It is similar to a lasso regression, with the difference being in the alpha constraint.
# 5-fold cross-validated ridge regression (alpha = 0); only the penalty type
# differs from the lasso fit above. FALSE is spelled out instead of the
# reassignable shorthand F (same value, safer idiom).
ridge_fit <- cv.glmnet(X, y,
                       family = "gaussian", trace.it = 1, nfolds = 5,
                       alpha = 0, intercept = FALSE)
## Training
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Fold: 1/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Fold: 2/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Fold: 3/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Fold: 4/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
## Fold: 5/5
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
# Print the cross-validated ridge fit: the lambda.min and lambda.1se rows
# summarise CV mean-squared error at the two standard lambda choices.
print(ridge_fit)
##
## Call: cv.glmnet(x = X, y = y, nfolds = 5, trace.it = 1, family = "gaussian", alpha = 0, intercept = F)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.8347 100 116.2 3.480 15
## 1se 2.5490 88 119.2 3.353 15
We use this model to predict the test data of 2018-2019. We also calculate the RMSE and plot the predicted scores against the observed scores.
# Predict the 2018-2019 test set with the CV ridge model.
# NOTE(review): no `s =` is given, so predict() uses its default lambda
# for cv.glmnet objects (s = "lambda.1se") — confirm this is intended.
pred_ridge <- predict(ridge_fit, X_test)
# Test-set root mean squared error of the ridge predictions.
RMSE_ridge <- RMSE(pred = pred_ridge, obs = y_test)
RMSE_ridge
## [1] 10.29709
# Predicted vs. observed scores; the identity line marks perfect prediction.
plot(pred_ridge, y_test)
abline(0, 1)
We also fit a decision tree (CART). Since the outcome is numeric, this is a regression tree — a model that is more flexible than a linear model.
# 5-fold cross-validation; allowParallel lets caret use a registered
# parallel backend if one exists.
trCntrl <- trainControl("cv", 5, allowParallel = TRUE)
# Tune a CART tree (rpart) over caret's default cp grid; y is numeric,
# so this fits a regression tree selected by RMSE.
fit_tree <- caret::train(x = X, y = y, method = "rpart", trControl = trCntrl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
fit_tree
## CART
##
## 5384 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4308, 4306, 4306, 4308, 4308
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.06070607 12.86789 0.2833116 10.03975
## 0.06430842 13.38301 0.2245544 10.44370
## 0.19404799 14.33378 0.1819956 11.20758
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06070607.
We now use this tree model to predict the test data.
# Predict the 2018-2019 test set with the tuned tree and compute its RMSE.
pred_tree <- predict(fit_tree, newdata = X_test)
RMSE_tree <- RMSE(pred = pred_tree, obs = y_test)
RMSE_tree
## [1] 12.90546
# Predicted vs. observed scores; the identity line marks perfect prediction.
plot(pred_tree, y_test)
abline(0, 1)
We now look at a different tree-based model: the random forest. Here we have to specify some parameters. For the mtry parameter we use the recommended amount, the sqrt() of the number of columns of the dataframe. For splitrule we use variance, and we set the minimal node size to 1.
# Common random-forest heuristic: mtry = sqrt(number of predictors).
# NOTE(review): sqrt(15) ~ 3.87 is not an integer; the final model reports
# Mtry: 3, so it is truncated downstream — floor() would make that explicit.
mtry_ncol <- sqrt(ncol(X))
## Fit random forest using caret::train
# Single-point tuning grid: fixed mtry, variance split rule (regression),
# and minimum node size 1; CV settings reused from trCntrl.
fit_rf <- train(X, y, method = "ranger",
trControl = trCntrl,
tuneGrid = expand.grid(mtry = mtry_ncol,
splitrule = "variance", min.node.size = 1)
)
# Inspect the underlying ranger model (trees, mtry, OOB error, OOB R^2).
fit_rf$finalModel
## Ranger result
##
## Call:
## ranger::ranger(dependent.variable.name = ".outcome", data = x, mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size, splitrule = as.character(param$splitrule), write.forest = TRUE, probability = classProbs, ...)
##
## Type: Regression
## Number of trees: 500
## Sample size: 5384
## Number of independent variables: 15
## Mtry: 3
## Target node size: 1
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 111.9331
## R squared (OOB): 0.5152332
We use this model to predict and calculate the RMSE. We then show the prediction accuracy in a plot.
# Predict the 2018-2019 test set with the random forest and compute RMSE.
pred_rf <- predict(fit_rf, newdata = X_test)
RMSE_rf <- RMSE(pred = pred_rf, obs = y_test)
RMSE_rf
## [1] 10.19637
# Predicted vs. observed scores; the identity line marks perfect prediction.
plot(pred_rf, y_test)
abline(0, 1)
Now that we have our various models, it is time to compare. We will be looking to see which of the models has the lowest RMSE - indicating the best predictions on the data from a year later.
# Collect the test-set RMSE of every fitted model for comparison.
RMSE_df <- data.frame(models = c("lm", "lasso",
"ridge", "decision tree", "random forest"),
RMSE = c(RMSE_lm, RMSE_lasso, RMSE_ridge, RMSE_tree, RMSE_rf))
# Flag the best (lowest-RMSE) model so it can be highlighted in red.
RMSE_df <- RMSE_df %>%
mutate(min = min(RMSE) == RMSE)
## Plot showing best performance
# reorder() sorts the bars by RMSE on the axis; the previous arrange()
# call had no effect because ggplot orders a discrete axis by factor
# levels, not by row order.
RMSE_df %>%
ggplot(aes(x = reorder(models, RMSE), y = RMSE, fill = min)) +
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
geom_col(show.legend = FALSE) +
labs(y = "RMSE",
x = "",
title = "Model comparison of test prediction accuracy",
subtitle = "using Root Mean Squared Error (RMSE) to compare",
# "\n" separates the two caption sentences (it was missing, which
# rendered them run together).
caption = "Best model displayed in Red\nData source: Inspectorate of Education") + #nolint
coord_flip(ylim = c(0, 15)) +
theme_minimal() +
scale_fill_manual(values = c("black", "Red")) +
geom_text(aes(label = round(RMSE, 3)), hjust = -0.5)
From this we can conclude that the best model to predict future data is the linear model using forward stepwise selection.
We now turn to another question that came up: how far ahead can we predict using these models?
We will be answering this question by looking if our best performing model (approach) can be used to predict data 2 years ahead, instead of 1 year only. We first make new training data - using the same approach as before, but now for 2016-2017.
# Build the training data for the 2016-2017 school year.
# NOTE(review): the original contained two earlier `train_data2`
# assignments (a join with d_personeel_tot by brin_nummer only, and an
# un-renamed d_weging join) that were immediately overwritten by the
# assignment below; both were dead code and have been removed.
train_data2 <- d_weging %>%
rename("schoolweging_2015_2016" = "schoolweging") %>%
filter(schooljaar == "2016-2017") %>%
right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
rename("schoolweging_2016_2017" = "schoolweging")
# NOTE(review): the first rename labels d_weging's 2016-2017 weights as
# "schoolweging_2015_2016", and the schoolweging column joined in from
# tmp_file becomes "schoolweging_2016_2017" — verify these year labels
# are the intended one-year shift.
# Missing-data handling: keep only rows with complete scores and school
# weights for both years, so that residuals() below aligns row-for-row
# with the data frame (lm() would silently drop NA rows otherwise).
clean_train_data2 <- train_data2 %>%
filter(!is.na(`2015-2016`),
!is.na(`2016-2017`),
!is.na(schoolweging_2015_2016),
!is.na(schoolweging_2016_2017))
# Regress each year's score on its school weight and keep the residuals:
# the weight-corrected part of each score.
mod <- lm(`2015-2016` ~ 1 + schoolweging_2015_2016, data = clean_train_data2)
clean_train_data2$res_2015_2016 <- residuals(mod)
# `mod` is deliberately reused (overwritten) for the second year.
mod <- lm(`2016-2017` ~ 1 + schoolweging_2016_2017, data = clean_train_data2)
clean_train_data2$res_2016_2017 <- residuals(mod)
# Outcome variable: year-over-year change in the weight-corrected score.
clean_train_data2$diff_corrected_1 <- clean_train_data2$res_2016_2017 -
clean_train_data2$res_2015_2016
# Keep identifiers, raw scores, difference scores, and the corrected
# residual columns for joining onto the full predictor set.
df_train2 <- clean_train_data2 %>%
select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
`2018-2019`, diff1, diff2, diff3, res_2015_2016, res_2016_2017,
diff_corrected_1)
# Join the 2016-2017 predictor set onto the outcome data built above.
train_tot2 <- d_tot %>%
filter(schooljaar == "2016-2017") %>%
left_join(df_train2, by = c("brin_nummer", "vestigingsnummer")) %>%
mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))
# Drop any row that still contains a missing value.
train_tot2 <- train_tot2[complete.cases(train_tot2), ]
# Rename the year columns to syntactically valid names and zero out
# infinite values. The original used two mutate_if() calls testing
# `abs(.) == Inf` and `abs(.) == -Inf`; the second condition is always
# FALSE (abs() is non-negative), so a single is.infinite() check covers
# both signs with identical behavior.
train_tot2 <- train_tot2 %>%
rename("y2015_2016" = "2015-2016",
"y2016_2017" = "2016-2017",
"y2017_2018" = "2017-2018",
"y2018_2019" = "2018-2019") %>%
mutate_if(is.numeric, ~ifelse(is.infinite(.), 0, .))
# Drop identifiers, location columns, difference scores, and any column
# that would leak the outcome before fitting the models.
train_model2 <- train_tot2 %>%
select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
diff2, diff3, diff1,
# remove variables with high correlation
perc_2F, y2015_2016,
# these predictors make the r-squared 1
y2017_2018, y2018_2019, y2016_2017,
res_2015_2016, res_2016_2017))
# Inspect the remaining predictors (plus the outcome diff_corrected_1).
colnames(train_model2)
## [1] "n_observaties"
## [2] "gemeente_aantal_inwoners"
## [3] "gemeente_perc_inwoners_0_tot_15_jaar"
## [4] "gemeente_bevolkingsdichtheid"
## [5] "ratio_leerling_leraar"
## [6] "gemiddelde_aanstelling"
## [7] "gemiddelde_leeftijd"
## [8] "perc_man"
## [9] "perc_tijdelijk_contract"
## [10] "ratio_leraar_ondersteuner"
## [11] "aantal_leerlingen"
## [12] "schoolweging"
## [13] "schoolweging_spreiding"
## [14] "prop_2F"
## [15] "diff_corrected_1"
We use this data to train a new linear model.
# Intercept-only baseline: the lower bound for forward stepwise selection.
nullmod2 <- lm(diff_corrected_1 ~ 1, data = train_model2)
# Full model with all remaining predictors: the upper scope for stepAIC.
fullmod2 <- lm(diff_corrected_1 ~., data = train_model2)
summary(fullmod2)
##
## Call:
## lm(formula = diff_corrected_1 ~ ., data = train_model2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.000 -8.844 -1.356 7.688 72.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.319e+01 4.664e+00 -17.836 < 2e-16 ***
## n_observaties -2.606e-03 8.213e-03 -0.317 0.750991
## gemeente_aantal_inwoners 3.597e-07 1.504e-06 0.239 0.811006
## gemeente_perc_inwoners_0_tot_15_jaar 5.247e-02 1.027e-01 0.511 0.609373
## gemeente_bevolkingsdichtheid -2.396e-04 1.871e-04 -1.281 0.200376
## ratio_leerling_leraar -1.059e-01 6.327e-02 -1.673 0.094386 .
## gemiddelde_aanstelling 2.923e+00 2.604e+00 1.123 0.261693
## gemiddelde_leeftijd -1.476e-02 4.464e-02 -0.331 0.740909
## perc_man -1.663e-02 2.009e-02 -0.828 0.407839
## perc_tijdelijk_contract 9.132e-02 2.272e-02 4.019 5.93e-05 ***
## ratio_leraar_ondersteuner -7.518e-03 6.049e-03 -1.243 0.213947
## aantal_leerlingen -1.666e-03 3.206e-03 -0.520 0.603356
## schoolweging 1.296e+00 6.886e-02 18.821 < 2e-16 ***
## schoolweging_spreiding 8.318e-01 2.465e-01 3.374 0.000745 ***
## prop_2F 7.056e+01 1.530e+00 46.131 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.78 on 5387 degrees of freedom
## Multiple R-squared: 0.2875, Adjusted R-squared: 0.2857
## F-statistic: 155.3 on 14 and 5387 DF, p-value: < 2.2e-16
# Forward stepwise selection by AIC: start from the intercept-only model and
# let any term from the full model enter until AIC no longer improves.
finalmod2 <- MASS::stepAIC(
  nullmod2,
  scope = list(upper = fullmod2),
  direction = "forward",
  trace = FALSE
)
finalmod2
##
## Call:
## lm(formula = diff_corrected_1 ~ prop_2F + schoolweging + perc_tijdelijk_contract +
## schoolweging_spreiding + aantal_leerlingen + ratio_leerling_leraar,
## data = train_model2)
##
## Coefficients:
## (Intercept) prop_2F schoolweging
## -82.213529 70.714035 1.316669
## perc_tijdelijk_contract schoolweging_spreiding aantal_leerlingen
## 0.090929 0.778440 -0.002867
## ratio_leerling_leraar
## -0.090165
summary(finalmod2)
##
## Call:
## lm(formula = diff_corrected_1 ~ prop_2F + schoolweging + perc_tijdelijk_contract +
## schoolweging_spreiding + aantal_leerlingen + ratio_leerling_leraar,
## data = train_model2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.395 -8.928 -1.376 7.695 72.210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -82.213529 3.347211 -24.562 < 2e-16 ***
## prop_2F 70.714035 1.524110 46.397 < 2e-16 ***
## schoolweging 1.316669 0.066531 19.790 < 2e-16 ***
## perc_tijdelijk_contract 0.090929 0.021318 4.265 2.03e-05 ***
## schoolweging_spreiding 0.778440 0.237846 3.273 0.00107 **
## aantal_leerlingen -0.002867 0.001431 -2.003 0.04520 *
## ratio_leerling_leraar -0.090165 0.061985 -1.455 0.14583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.77 on 5395 degrees of freedom
## Multiple R-squared: 0.2868, Adjusted R-squared: 0.286
## F-statistic: 361.6 on 6 and 5395 DF, p-value: < 2.2e-16
As we can see, this model - with an R-squared of .29 - explains a lot less variance in the outcome variable than the previous model - with an R-squared of .55. A possible explanation for this is the fact that the first model used the difference score of the previous year as one of its predictor variables, something that cannot be used for this model using the first possible difference score.
We will now try to predict the scores 2 years later, using this model.
# Score the selected model on the held-out test set against the outcome
# observed two school years later (diff_corrected_3).
pred_lm2 <- predict(finalmod2, newdata = test_model)
RMSE_lm2 <- RMSE(pred = pred_lm2, obs = test_model$diff_corrected_3)
print(RMSE_lm2)
## [1] 12.50975
# Predicted vs observed corrected difference scores for the test set.
tibble(pred_lm = pred_lm2, diff_corrected_3 = test_model$diff_corrected_3) %>%
  ggplot(aes(x = pred_lm, y = diff_corrected_3)) +
  geom_point(alpha = .2) +
  # identity line: points on it are predicted perfectly
  geom_abline() +
  labs(title = "Predictions vs Observations",
       # single literal with "\n": the original string spanned two source
       # lines, so the axis label contained a raw line break followed by the
       # source file's indentation spaces
       y = "Corrected Difference of\nObservations (2018/2019 - 2017/2018)",
       x = "LM Predictions",
       subtitle = "Using data to predict 2 years into the future",
       caption = "Data Source: Inspectorate of Education") +
  theme_minimal() +
  # annotate() draws the label exactly once; geom_text() with constant
  # aesthetics re-draws it for every row of the data, overplotting the text
  annotate("text", x = 30, y = -30, size = 4,
           label = paste("RMSE = ", round(RMSE_lm2, 3)))
As we can see, the RMSE is quite a bit lower for data predicted two years ahead. Its score was just above that of the decision tree (which used a categorical classification model, so it is natural that it performs worse than a continuous prediction model).
lintr::lint(filename = rstudioapi::getSourceEditorContext()$path)
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:635:1: style: Variable and function name style should be snake_case.
## getNearZeroVariationPredictors <- function(df) {
## ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:716:1: style: Variable and function name style should be snake_case.
## RMSE_lm <- RMSE(pred = pred_lm, obs = test_model$diff_corrected_3)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:775:2: style: Commented code should be removed.
## #shp <- shapiro.test(resid(fit)) # if significant assumption is violated
## ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:792:55: style: Trailing whitespace is superfluous.
## abline(v = 2 * mean(hatvalues(fit)), col = 3, lty = 3)
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:793:47: style: Trailing whitespace is superfluous.
## sum(hatvalues(fit) > 2 * mean(hatvalues(fit)))
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:794:70: style: Trailing whitespace is superfluous.
## outliers_iv_index <- which(hatvalues(fit) > 2 * mean(hatvalues(fit)))
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:816:74: style: Commas should always have a space after.
## outliers_removed <- train_model[-c(outliers_iv_index, outliers_dv_index),]
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:828:1: style: Variable and function name style should be snake_case.
## X <- train_model %>%
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:834:1: style: Variable and function name style should be snake_case.
## X_test <- test_model %>%
## ^~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:853:1: style: Variable and function name style should be snake_case.
## RMSE_lasso <- RMSE(pred = pred_lasso, obs = y_test)
## ^~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:877:1: style: Variable and function name style should be snake_case.
## RMSE_ridge <- RMSE(pred = pred_ridge, obs = y_test)
## ^~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:888:1: style: Variable and function name style should be snake_case.
## trCntrl <- trainControl("cv", 5, allowParallel = TRUE)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:898:1: style: Variable and function name style should be snake_case.
## RMSE_tree <- RMSE(pred = pred_tree, obs = y_test)
## ^~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:924:1: style: Variable and function name style should be snake_case.
## RMSE_rf <- RMSE(pred = pred_rf, obs = y_test)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:933:1: style: Variable and function name style should be snake_case.
## RMSE_df <- data.frame(models = c("lm", "lasso",
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:937:1: style: Variable and function name style should be snake_case.
## RMSE_df <- RMSE_df %>%
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:1052:1: style: Variable and function name style should be snake_case.
## RMSE_lm2 <- RMSE(pred = pred_lm2, obs = test_model$diff_corrected_3)
## ^~~~~~~~